Multiples factors play an important function during the naive CD4-T cell differentiation where Blimp1 (encoded by Prdm1) and Bcl6 are the masters. In this notebook, we are going to characterize the accessibility pattern of these regulators.
library(Seurat)
library(Signac)
library(GenomicRanges)
library(pheatmap)
library(JASPAR2020)
library(TFBSTools)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(data.table)
library(ggpubr)
library(writexl)
library(plyr)
library(stringr)
library(rio)
cell_type <- "CD4_T"
path_to_obj <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/04.",
cell_type,
"_integration_peak_calling_level_5.rds",
sep = ""
)
path_to_RNA <- paste0(
here::here("scRNA-seq/3-clustering/5-level_5/"),
cell_type,
"/",
cell_type,
"_subseted_annotated_level_5.rds",
sep = ""
)
upstream <- 2000
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D",
"#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
"#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
"#FBE426", "#16FF32", "black", "#3283FE",
"#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
"#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
"#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
"#FEAF16", "#683B79", "#B10DA1", "#1C7F93",
"#F8A19F", "dark orange", "#FEAF16",
"#FBE426", "Brown")
plot_dim <- function(seurat, group){
DimPlot(seurat,
group.by = group,
cols = color_palette,
pt.size = 0.1,raster=FALSE)
}
mat_heatmap <- function(seurat, features, group,
cutree_ncols,cutree_nrows){
my_sample_col <- data.frame(Group = rep(c("Naive", "Central Memory",
"Central Memory","Non-Tfh",
"Non-Tfh","Non-Tfh",
"Non-Tfh","Non-Tfh",
"Non-Tfh","Tfh",
"Tfh","Tfh",
"Tfh","Tfh")))
rownames(my_sample_col) = c("Naive", "CM Pre-non-Tfh","CM PreTfh",
"T-Trans-Mem","T-Eff-Mem","T-helper",
"Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs" ,
"B border_Tfh T", "Tfh-LZ-GC",
"GC-Tfh-SAP","GC-Tfh-0X40","Tfh-Mem")
annoCol<-list(Group=c("Naive"="grey", "Central Memory"="black",
"Non-Tfh"="red", "Tfh"="yellow"))
avgexpr_mat <- AverageExpression(
features = features,
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = group,
slot = "data")
p1 <- pheatmap(avgexpr_mat$peaks_level_5[,c(rownames(my_sample_col))],
scale = "row",
angle_col = 45,
show_rownames=T,
annotation_col = my_sample_col,
annotation_colors = annoCol,
border_color = "white",
cluster_rows = T,
cluster_cols = F,
fontsize_col = 10,
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
cutree_rows = cutree_nrows)
return(p1)}
gene_reference <- here::here("scATAC-seq/Cicero/genes.gtf")
gtf <- rtracklayer::import(gene_reference)
gtf_df <- as.data.frame(gtf)
gtf_df_gene <- gtf_df[gtf_df$type == "gene",]
DT::datatable(gtf_df_gene)
# We are going to add 2000 bp upstream of each gene to take in acccount the proximal promoter regions
gtf_df_gene$start <- gtf_df_gene$start - upstream
seurat <- readRDS(path_to_obj)
seurat_peaks <- seurat@assays$peaks_level_5@ranges
seurat
## An object of class Seurat
## 93116 features across 16383 samples within 1 assay
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
plot_dim(seurat, group = "annotation_paper")
seurat_RNA <- readRDS(path_to_RNA)
seurat_RNA
## An object of class Seurat
## 37378 features across 52307 samples within 1 assay
## Active assay: RNA (37378 features, 0 variable features)
## 3 dimensional reductions calculated: pca, umap, harmony
plot_dim(seurat_RNA, group = "annotation_paper")
At low level of resolution, we want to detect the main epigenomic changes between Non-Tfh vs Tfh. For this reason, we decide to group the cells in 4 clusters: Non-Tfh, Tfh, Central Memory and Naive.
seurat@meta.data <- seurat@meta.data %>% mutate(Group =
case_when(annotation_paper == "Naive" ~ "Naive",
annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
annotation_paper == "CM PreTfh" ~ "Central Memory",
annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
annotation_paper == "T-helper" ~ "Non-Tfh",
annotation_paper == "Tfh T:B border" ~ "Tfh",
annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
annotation_paper == "Tfh-Mem" ~ "Tfh",
annotation_paper == "Memory T cells" ~ "Non-Tfh",
annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))
plot_dim(seurat, group = "Group")
seurat_RNA@meta.data <- seurat_RNA@meta.data %>% mutate(Group =
case_when(annotation_paper == "Naive" ~ "Naive",
annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
annotation_paper == "CM PreTfh" ~ "Central Memory",
annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
annotation_paper == "T-helper" ~ "Non-Tfh",
annotation_paper == "Tfh T:B border" ~ "Tfh",
annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
annotation_paper == "Tfh-Mem" ~ "Tfh",
annotation_paper == "Memory T cells" ~ "Non-Tfh",
annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))
plot_dim(seurat_RNA, group = "Group")
The main goal of this analisys is to extract the differential expressed genes between the groups previously defined.
Idents(seurat_RNA) <- "Group"
#DE <- FindAllMarkers(
# object = seurat_RNA,
#logfc.threshold = 0.25,
#test.use = "wilcox")
#output <- split(DE, DE$cluster)
path_to_save_DE <- here::here("scATAC-seq/results/files/CD4_T/DE_4_groups.xlsx")
#write_xlsx(output, path_to_save_DE)
#DT::datatable(DE)
DE <- import_list(path_to_save_DE, setclass = "tbl", rbind = TRUE)
colnames(DE) <- c("p_val", "avg_log2FC",
"pct.1" , "pct.2", "p_val_adj",
"cluster", "gene_name", "_file")
Tfh_genes <- c("TOX2", "PDCD1","CXCL13","TOX","BCL6",
"GNG4","IL21","SH2D1A","CD200","CXCR5","POU2AF1")
Naive_genes <- c("BACH2","LEF1","CCR7","NOSIP","KLF2","SELL")
Central_memory_genes <- c("ANK3","IL7R","TXNIP","ANXA1",
"ZBTB16","GPR183","TIGIT","IL21")
Non_Tfh_genes <- c("LAG3","RORA","IKZF2","KLRB1",
"IL2RA","PRDM1","IL1R1","CTLA4",
"FOXP3","CCR6","MAF","CCL20","IL1R2")
target_genes <- c(Naive_genes,Central_memory_genes,
Tfh_genes,Non_Tfh_genes)
expr_mat <- AverageExpression(
features = target_genes,
seurat_RNA,
return.seurat = F,
group.by = "Group",
slot = "data")
pheatmap(expr_mat$RNA[target_genes,],
scale = "row",
annotation_names_row = F,
border_color = "white",
cluster_rows = F,
cluster_cols = T,
fontsize_row = 10,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2")
#write.table(unique(expr_mat$RNA[target_genes,]),
# quote = F,
# here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_RNA_genes.tsv"))
all_genes <- gtf_df_gene[which(gtf_df_gene$gene_name %in% target_genes),]
all_genes_GR <- makeGRangesFromDataFrame(all_genes[c(1:5,13)],
keep.extra.columns = T)
all_genes_GR
## GRanges object with 37 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## 141854 chr1 145990435-145996579 - | TXNIP
## 188313 chr1 169688665-169711702 - | SELL
## 243943 chr1 235545685-235650754 - | GNG4
## 339680 chr2 101989960-102028544 + | IL1R2
## 339786 chr2 102062544-102179874 + | IL1R1
## ... ... ... ... . ...
## 2409954 chr19 16322817-16327874 + | KLF2
## 2485933 chr19 49553468-49590262 - | NOSIP
## 2557042 chr20 43912852-44069616 + | TOX2
## 2702697 chrX 49248436-49270477 - | FOXP3
## 2737879 chrX 124225868-124373197 + | SH2D1A
## -------
## seqinfo: 40 sequences from an unspecified genome; no seqlengths
## Overlapping of the DE genes coordinates with the total number of peaks detected.
gr1 <- seurat_peaks
gr2 <- all_genes_GR
m <- findOverlaps(gr1, gr2)
gr1.matched <- gr1[queryHits(m)]
# Add the metadata from gr2
mcols(gr1.matched) <- cbind.data.frame(
mcols(gr1.matched),
mcols(gr2[subjectHits(m)]));
gr1.matched
## GRanges object with 515 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 145996409-145997493 * | TXNIP
## [2] chr1 169692335-169693366 * | SELL
## [3] chr1 169694288-169694677 * | SELL
## [4] chr1 169699061-169699292 * | SELL
## [5] chr1 169700944-169701648 * | SELL
## ... ... ... ... . ...
## [511] chrX 124328812-124329858 * | SH2D1A
## [512] chrX 124339089-124339448 * | SH2D1A
## [513] chrX 124342323-124342811 * | SH2D1A
## [514] chrX 124346063-124346707 * | SH2D1A
## [515] chrX 124358836-124359410 * | SH2D1A
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
gr1.matched$peaks <- paste0(seqnames(gr1.matched),"-",
start(gr1.matched),"-",
end(gr1.matched))
gr1.matched_df <- as.data.frame(gr1.matched)
my_sample_col <- data.frame(Gene = c(gr1.matched$gene_name))
rownames(my_sample_col) = unique(gr1.matched$peaks)
avgexpr_mat <- AverageExpression(
features = unique(gr1.matched$peaks),
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = "Group",
slot = "data")
avgexpr_df <- as.data.frame(avgexpr_mat$peaks_level_5)
avgexpr_df$peaks <- row.names(avgexpr_df)
DA_DE_merge <- merge(avgexpr_df,
gr1.matched_df[c("peaks","gene_name")],
by=c("peaks"))
DA_DE_merge_melt <- melt(DA_DE_merge)
# Computing the mean accessibility/expression per gene
mean_accessibility <- tapply(DA_DE_merge_melt$value,
list(DA_DE_merge_melt$gene_name, DA_DE_merge_melt$variable),
mean)
out <- pheatmap(mean_accessibility[target_genes,],
scale = "row",
border_color = "white",
cluster_rows = F,
cluster_cols = T,
fontsize_row = 10,
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2")
#write.table(unique(mean_accessibility[target_genes,]),
# quote = F,
# here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_ATAC_genes.tsv"))
DA_DE_merge <- merge(melt(mean_accessibility),
melt(expr_mat$RNA),
by=c("Var1","Var2"))
colnames(DA_DE_merge) <- c("Gene" ,"Clusters","Accesibility","Expresion")
DA_DE_merge.melt <- melt(DA_DE_merge)
# Filtering conditions
df_Naive <- filter(
DA_DE_merge.melt,Clusters == "Naive" & Gene %in% Naive_genes)
df_CM <- filter(
DA_DE_merge.melt,Clusters == "Central Memory" & Gene %in% Central_memory_genes)
df_Tfh <- filter(
DA_DE_merge.melt,Clusters == "Tfh" & Gene %in% Tfh_genes)
df_Non_Tfn <- filter(
DA_DE_merge.melt,Clusters == "Non-Tfh" & Gene %in% Non_Tfh_genes)
selection_df <- rbind(df_Naive,df_CM,df_Tfh,df_Non_Tfn)
selection_df$value <- scale(selection_df$value)
ggdotchart(selection_df,
x="Gene",
y="value",
add = "segments") +
coord_flip() + facet_grid(vars(Clusters), vars(variable), scales = "free_y")
bcl6 <- gtf_df_gene[which(gtf_df_gene$gene_name %in% "BCL6"),]
bcl6_gr <- makeGRangesFromDataFrame(bcl6)
bcl6_plot <- CoveragePlot(
object = seurat,
region = bcl6_gr)
bcl6_plot
overlapping_bcl6 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks,
bcl6_gr)),]
features <- paste0(seqnames(overlapping_bcl6),"-",
start(overlapping_bcl6),"-",
end(overlapping_bcl6))
#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/bcl6_heatmaps.pdf"),
# width = 10,
# height = 4)
print(mat_heatmap(seurat = seurat,
features = features,
group = "annotation_paper",
cutree_ncols = 3,cutree_nrows = 1))
#dev.off()
prdm1 <- gtf_df_gene[which(gtf_df_gene$gene_name %in% "PRDM1"),]
prdm1_gr <- makeGRangesFromDataFrame(prdm1)
prdm1_plot <-CoveragePlot(
object = seurat,
region = prdm1_gr)
prdm1_plot
overlapping_prdm1 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, prdm1_gr)),]
features <- paste0(seqnames(overlapping_prdm1),"-",
start(overlapping_prdm1),"-",
end(overlapping_prdm1))
#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/prdm1_heatmaps.pdf"),
# width = 10,
# height = 4)
print(mat_heatmap(seurat = seurat,
features = features,
group = "annotation_paper",
cutree_ncols = 3,cutree_nrows = 1))
#dev.off()
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rio_0.5.16 plyr_1.8.6 writexl_1.4.0 ggpubr_0.4.0 data.table_1.14.0 forcats_0.5.0 stringr_1.4.0 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 tidyverse_1.3.0 ggplot2_3.3.5 dplyr_1.0.7 TFBSTools_1.26.0 JASPAR2020_0.99.10 pheatmap_1.0.12 GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 reticulate_1.20 R.utils_2.10.1 tidyselect_1.1.1 poweRlaw_0.70.6 RSQLite_2.2.1 AnnotationDbi_1.50.3 htmlwidgets_1.5.3 grid_4.0.3 docopt_0.7.1 BiocParallel_1.22.0 Rtsne_0.15 munsell_0.5.0 codetools_0.2-17 ica_1.0-2 DT_0.16 future_1.21.0 miniUI_0.1.1.1 withr_2.4.2 colorspace_2.0-2 Biobase_2.48.0 knitr_1.30 rstudioapi_0.11 ROCR_1.0-11 ggsignif_0.6.0 tensor_1.5 listenv_0.8.0 labeling_0.4.2 slam_0.1-47 GenomeInfoDbData_1.2.3 polyclip_1.10-0 bit64_4.0.5 farver_2.1.0 rprojroot_2.0.2 parallelly_1.26.1 vctrs_0.3.8 generics_0.1.0 xfun_0.18 lsa_0.73.2 ggseqlogo_0.1 R6_2.5.0 bitops_1.0-7
## [43] spatstat.utils_2.2-0 DelayedArray_0.14.0 assertthat_0.2.1 promises_1.2.0.1 scales_1.1.1 gtable_0.3.0 globals_0.14.0 goftest_1.2-2 seqLogo_1.54.3 rlang_0.4.11 RcppRoll_0.3.0 splines_4.0.3 rstatix_0.6.0 rtracklayer_1.48.0 lazyeval_0.2.2 broom_0.7.2 spatstat.geom_2.2-0 modelr_0.1.8 BiocManager_1.30.10 yaml_2.2.1 reshape2_1.4.4 abind_1.4-5 crosstalk_1.1.1 backports_1.1.10 httpuv_1.6.1 tools_4.0.3 bookdown_0.21 ellipsis_0.3.2 spatstat.core_2.2-0 RColorBrewer_1.1-2 ggridges_0.5.3 Rcpp_1.0.6 zlibbioc_1.34.0 RCurl_1.98-1.2 rpart_4.1-15 deldir_0.2-10 pbapply_1.4-3 cowplot_1.1.1 zoo_1.8-9 haven_2.3.1 SummarizedExperiment_1.18.1 ggrepel_0.9.1
## [85] cluster_2.1.0 here_1.0.1 fs_1.5.0 magrittr_2.0.1 scattermore_0.7 openxlsx_4.2.4 reprex_0.3.0 lmtest_0.9-38 RANN_2.6.1 SnowballC_0.7.0 fitdistrplus_1.1-5 matrixStats_0.59.0 hms_0.5.3 patchwork_1.1.1 mime_0.11 evaluate_0.14 xtable_1.8-4 XML_3.99-0.3 readxl_1.3.1 sparsesvd_0.2 gridExtra_2.3 compiler_4.0.3 KernSmooth_2.23-17 crayon_1.4.1 R.oo_1.24.0 htmltools_0.5.1.1 mgcv_1.8-33 later_1.2.0 lubridate_1.7.9 DBI_1.1.0 tweenr_1.0.1 dbplyr_1.4.4 MASS_7.3-53 car_3.0-10 Matrix_1.3-4 cli_3.0.0 R.methodsS3_1.8.1 igraph_1.2.6 pkgconfig_2.0.3 GenomicAlignments_1.24.0 TFMPvalue_0.0.8 foreign_0.8-80
## [127] plotly_4.9.4.1 spatstat.sparse_2.0-0 xml2_1.3.2 annotate_1.66.0 DirichletMultinomial_1.30.0 XVector_0.28.0 rvest_0.3.6 digest_0.6.27 sctransform_0.3.2 RcppAnnoy_0.0.18 pracma_2.2.9 CNEr_1.24.0 spatstat.data_2.1-0 Biostrings_2.56.0 cellranger_1.1.0 rmarkdown_2.5 leiden_0.3.8 fastmatch_1.1-0 uwot_0.1.10 curl_4.3.2 shiny_1.6.0 Rsamtools_2.4.0 gtools_3.9.2 lifecycle_1.0.0 nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4 viridisLite_0.4.0 BSgenome_1.56.0 fansi_0.5.0 pillar_1.6.1 lattice_0.20-41 KEGGREST_1.28.0 fastmap_1.1.0 httr_1.4.2 survival_3.2-7 GO.db_3.11.4 glue_1.4.2 zip_2.1.1 qlcMatrix_0.9.7 png_0.1-7 bit_4.0.4
## [169] ggforce_0.3.2 stringi_1.6.2 blob_1.2.1 caTools_1.18.2 memoise_1.1.0 irlba_2.3.3 future.apply_1.7.0